Regressions

PCA of Volume

Here, we use scikit-learn to compute the principal components of our volume. This will help us identify better axes along which to sample our data.


In [1]:
from sklearn.decomposition import PCA
import numpy
import csv

In [18]:
data = open('../data/data.csv', 'r').readlines()
fieldnames = ['x', 'y', 'z', 'unmasked', 'synapses']
reader = csv.reader(data)
reader.next()  # skip the header row

rows = [[int(col) for col in row] for row in reader]

# Unique, sorted coordinate values along each axis
sorted_x = sorted(set(r[0] for r in rows))
sorted_y = sorted(set(r[1] for r in rows))
sorted_z = sorted(set(r[2] for r in rows))
print len(sorted_x)
print len(sorted_y)
print len(sorted_z)

# Grid the synapse counts into a dense (x, y, z) volume. numpy.zeros
# (rather than numpy.ndarray, which leaves memory uninitialized)
# guarantees that bins with no synapses hold 0.
volume = numpy.zeros((len(sorted_x), len(sorted_y), len(sorted_z)))
for row in rows:
    if row[-1] != 0:
        volume[sorted_x.index(row[0]), sorted_y.index(row[1]), sorted_z.index(row[2])] = row[-1]


108
52
11
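
As a quick sanity check (not in the original notebook), we can confirm the gridding preserved the data; this assumes each (x, y, z) triple appears at most once in the CSV:

In [ ]:
# The dense volume should hold exactly the synapse counts from the CSV
total_synapses = sum(r[-1] for r in rows)
assert volume.sum() == total_synapses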

In [3]:
# Reduce the first z-slice of the volume to its single strongest axis
pca = PCA(n_components=1)
transform = pca.fit_transform(volume[:, :, 0])
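
To judge how useful that leading axis is, we can inspect the fitted model (a quick check, not in the original notebook); both attributes are set by scikit-learn after fit_transform:

In [ ]:
# Fraction of the slice's variance captured by the first component
print pca.explained_variance_ratio_
# Direction of the new sampling axis in the original coordinates
print pca.components_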

Regressions

We run linear regression over our data and a held-out test set to compare their outcomes (plotted below). We then use sklearn.cross_validation to train on $x$ samples and predict the remaining $\mathrm{total} - x$ test samples.
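
Since the cell below never actually calls sklearn.cross_validation, here is a minimal sketch of that split, using the train_test_split helper from the (pre-0.18) cross_validation module on hypothetical aligned arrays X and y:

In [ ]:
import numpy
from sklearn import cross_validation

# Hypothetical feature matrix and target vector with matching lengths
X = numpy.arange(20).reshape(10, 2)
y = numpy.arange(10)

# Hold out 20% of the samples as the test set
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=0)
print x_train.shape, x_test.shape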


In [6]:
# Maximum-intensity projection of the volume along z
mip = numpy.ndarray(volume.shape[:-1])
for x in range(volume.shape[0]):
    # Max over z at each y; numpy.max(volume[x]) with no axis would
    # collapse the whole (y, z) slab to a single scalar
    mip[x] = numpy.max(volume[x], axis=-1)

In [17]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import linear_model

# Split the data into training/testing sets
x_train = mip[:-20]
x_test = mip[-20:]

# Split the targets into training/testing sets
y_train = sorted_z[:5]
y_test = sorted_z[5:11]

print len(x_train)
print sorted_z
print y_train
print y_test

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets. Note the mismatch:
# x_train has 88 samples but y_train only 5 targets, so this call
# raises the ValueError shown below.
regr.fit(x_train, y_train)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean square error
print("Residual sum of squares: %.2f"
      % numpy.mean((regr.predict(x_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(x_test, y_test))

# Plot outputs
plt.scatter(x_test, y_test, color='black')
plt.plot(x_test, regr.predict(x_test), color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()


88
[55, 166, 277, 388, 499, 610, 721, 832, 943, 1054, 1165]
[55, 166, 277, 388, 499]
[610, 721, 832, 943, 1054, 1165]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-17-4c6f1013a93a> in <module>()
     18 
     19 # Train the model using the training sets
---> 20 regr.fit(x_train, y_train)
     21 
     22 # The coefficients

/Library/Python/2.7/site-packages/sklearn/linear_model/base.pyc in fit(self, X, y, sample_weight)
    425         n_jobs_ = self.n_jobs
    426         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
--> 427                          y_numeric=True, multi_output=True)
    428 
    429         if ((sample_weight is not None) and np.atleast_1d(sample_weight).ndim > 1):

/Library/Python/2.7/site-packages/sklearn/utils/validation.pyc in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    518         y = y.astype(np.float64)
    519 
--> 520     check_consistent_length(X, y)
    521 
    522     return X, y

/Library/Python/2.7/site-packages/sklearn/utils/validation.pyc in check_consistent_length(*arrays)
    174     if len(uniques) > 1:
    175         raise ValueError("Found arrays with inconsistent numbers of samples: "
--> 176                          "%s" % str(uniques))
    177 
    178 

ValueError: Found arrays with inconsistent numbers of samples: [ 5 88]
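
The ValueError is the mismatch noted in the comment above: x_train has 88 samples while y_train has only 5 targets. One consistent setup (an assumption, not the original analysis) is to give the regression one sample per z-slice, e.g. predicting depth from the mean synapse count at that depth:

In [ ]:
# One sample per z-slice: mean synapse count at each depth
X = volume.mean(axis=(0, 1)).reshape(-1, 1)  # shape (11, 1)
y = numpy.array(sorted_z)                    # shape (11,)

# Train on the first 8 slices, test on the remaining 3
x_train, x_test = X[:8], X[8:]
y_train, y_test = y[:8], y[8:]

regr = linear_model.LinearRegression()
regr.fit(x_train, y_train)
print('Variance score: %.2f' % regr.score(x_test, y_test))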
